Quick example of Latent Profile Analysis in R
library(tidyverse)
survey <- read_csv("https://raw.githubusercontent.com/whipson/tidytuesday/master/young_people.csv") %>%
select(History:Pets)
dim(survey)
## [1] 1010 32
head(survey)
careless packagelibrary(careless)
library(psych)
interests <- survey %>%
mutate(string = longstring(.)) %>%
mutate(md = outlier(., plot = FALSE))
careless::longstring : identifies the longest string of identical consecutive responses for each observationpsych::outlier : find and graph Mahalanobis squared distances to detect outliershead(interests)
cutoff <- (qchisq(p = 1 - .001, df = ncol(interests)))
cutoff
## [1] 65.24722
interests_clean <- interests %>%
filter(string <= 10,
md < cutoff) %>%
select(-string, -md)
dim(interests_clean)
## [1] 997 32
head(interests_clean)
mclust
library(mclust)
interests_clustering <- interests_clean %>%
na.omit() %>%
mutate_all(list(scale)) # scale to see the differences more clearly
head(interests_clustering)
BIC <- mclustBIC(interests_clustering)
plot(BIC)
mclustModelNames
## function (model)
## {
## type <- switch(EXPR = as.character(model), E = "univariate, equal variance",
## V = "univariate, unequal variance", EII = "spherical, equal volume",
## VII = "spherical, varying volume", EEI = "diagonal, equal volume and shape",
## VEI = "diagonal, equal shape", EVI = "diagonal, equal volume, varying shape",
## VVI = "diagonal, varying volume and shape", EEE = "ellipsoidal, equal volume, shape and orientation",
## EVE = "ellipsoidal, equal volume and orientation", VEE = "ellipsoidal, equal shape and orientation",
## VVE = "ellipsoidal, equal orientation", EEV = "ellipsoidal, equal volume and shape",
## VEV = "ellipsoidal, equal shape", EVV = "ellipsoidal, equal volume",
## VVV = "ellipsoidal, varying volume, shape, and orientation",
## X = "univariate normal", XII = "spherical multivariate normal",
## XXI = "diagonal multivariate normal", XXX = "ellipsoidal multivariate normal",
## warning("invalid model"))
## return(list(model = model, type = type))
## }
## <bytecode: 0x0000000018554338>
## <environment: namespace:mclust>
summary(BIC)
## Best BIC values:
## VVE,3 VEE,3 EVE,3
## BIC -75042.7 -75165.1484 -75179.165
## BIC diff 0.0 -122.4442 -136.461
mod1 <- Mclust(interests_clustering, modelNames = "VEE", G = 3, x = BIC)
summary(mod1)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 3
## components:
##
## log-likelihood n df BIC ICL
## -35455.83 874 628 -75165.15 -75216.14
##
## Clustering table:
## 1 2 3
## 137 527 210
ICL <- mclustICL(interests_clustering)
plot(ICL)
summary(ICL)
## Best ICL values:
## VVE,3 VEE,3 EVE,3
## ICL -75134.69 -75216.13551 -75272.891
## ICL diff 0.00 -81.44795 -138.203
mclustBootstrapLRT(interests_clustering, modelName = "VEE")
## -------------------------------------------------------------
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model = VEE
## Replications = 999
## LRTS bootstrap p-value
## 1 vs 2 197.0384 0.001
## 2 vs 3 684.8743 0.001
## 3 vs 4 -124.1935 1.000
library(reshape2)
means <- data.frame(mod1$parameters$mean, stringsAsFactors = FALSE) %>%
rownames_to_column() %>%
rename(Interest = rowname) %>%
melt(id.vars = "Interest", variable.name = "Profile", value.name = "Mean") %>%
mutate(Mean = round(Mean, 2),
Mean = ifelse(Mean > 1, 1, Mean))
round(table(mod1$classification)/nrow(interests_clustering),2)
##
## 1 2 3
## 0.16 0.60 0.24
means
means %>%
ggplot(aes(Interest, Mean, group = Profile, color = Profile)) +
geom_point(size = 2.25) +
geom_line(size = 1.25) +
scale_x_discrete(limits = c("Active sport", "Adrenaline sports", "Passive sport",
"Countryside, outdoors", "Gardening", "Cars",
"Art exhibitions", "Dancing", "Musical instruments",
"Theatre", "Writing", "Reading","Geography",
"History", "Law", "Politics", "Psychology", "Religion",
"Foreign languages", "Biology", "Chemistry", "Mathematics",
"Medicine", "Physics", "Science and technology",
"Internet", "PC","Celebrities", "Economy Management",
"Fun with friends", "Shopping", "Pets")) +
labs(x = NULL, y = "Standardized mean interest") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")
library(knitr)
library(kableExtra)
data <- data.frame(mod1$parameters$mean, stringsAsFactors = FALSE) %>%
rownames_to_column() %>%
rename(Interest = rowname) %>%
mutate(X1 = round(X1, 2),
X1 = ifelse(X1 > 1, 1, X1),
X2 = round(X2, 2),
X2 = ifelse(X2 > 1, 1, X2),
X3 = round(X3, 2),
X3 = ifelse(X3 > 1, 1, X3))
data %>%
mutate(
X1 = cell_spec(X1, "html", color = ifelse(X1 > 0.3, "red", "black")),
X2 = cell_spec(X2, "html", color = ifelse(X2 > 0.3, "red", "black")),
X3 = cell_spec(X3, "html", color = ifelse(X3 > 0.3, "red", "black"))) %>%
kable(format = "html", escape = F) %>%
kable_styling("striped", full_width = F)
| Interest | X1 | X2 | X3 |
|---|---|---|---|
| History | -0.11 | -0.08 | 0.28 |
| Psychology | -0.13 | -0.13 | 0.43 |
| Politics | -0.21 | -0.01 | 0.15 |
| Mathematics | 0.03 | 0 | -0.03 |
| Physics | 0.25 | -0.1 | 0.08 |
| Internet | -0.14 | 0.08 | -0.11 |
| PC | -0.18 | 0.08 | -0.09 |
| Economy Management | -0.48 | 0.15 | -0.06 |
| Biology | 1 | -0.35 | 0.01 |
| Chemistry | 1 | -0.42 | -0.08 |
| Reading | 0.18 | -0.23 | 0.47 |
| Geography | -0.1 | -0.06 | 0.23 |
| Foreign languages | -0.1 | -0.06 | 0.23 |
| Medicine | 1 | -0.37 | 0.06 |
| Law | -0.13 | -0.02 | 0.14 |
| Cars | -0.09 | 0.14 | -0.3 |
| Art exhibitions | -0.12 | -0.18 | 0.53 |
| Religion | 0.05 | -0.11 | 0.25 |
| Countryside, outdoors | 0.06 | -0.03 | 0.03 |
| Dancing | 0.16 | -0.12 | 0.21 |
| Musical instruments | 0.01 | -0.18 | 0.47 |
| Writing | -0.41 | -0.52 | 1 |
| Passive sport | -0.02 | 0.07 | -0.18 |
| Active sport | -0.01 | 0 | 0 |
| Gardening | 0.28 | -0.19 | 0.29 |
| Celebrities | -0.03 | 0.02 | -0.03 |
| Shopping | 0.02 | -0.01 | 0.01 |
| Science and technology | 0.26 | -0.07 | 0 |
| Theatre | 0.02 | -0.15 | 0.36 |
| Fun with friends | 0.12 | 0.02 | -0.13 |
| Adrenaline sports | -0.06 | 0 | 0.04 |
| Pets | 0.28 | -0.08 | 0.01 |
p <- means %>%
mutate(Profile = recode(Profile,
X1 = "Science: 16%",
X2 = "Disinterest: 60%",
X3 = "Arts & Humanities: 24%")) %>%
ggplot(aes(Interest, Mean, group = Profile, color = Profile)) +
geom_point(size = 2.25) +
geom_line(size = 1.25) +
scale_x_discrete(limits = c("Active sport", "Adrenaline sports", "Passive sport",
"Countryside, outdoors", "Gardening", "Cars",
"Art exhibitions", "Dancing", "Musical instruments",
"Theatre", "Writing", "Reading", "Geography",
"History", "Law", "Politics", "Psychology", "Religion",
"Foreign languages", "Biology", "Chemistry", "Mathematics",
"Medicine", "Physics", "Science and technology",
"Internet", "PC", "Celebrities", "Economy Management",
"Fun with friends", "Shopping", "Pets")) +
labs(x = NULL, y = "Standardized mean interest") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")
p
library(plotly)
ggplotly(p, tooltip = c("Interest", "Mean")) %>%
layout(legend = list(orientation = "h", y = 1.2))